In [4]:
# Polynomial Regression (application to the US population)
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
In [6]:
# Load the data
data = pd.read_csv("C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data\\USpop.csv")
print(data.columns)
Index(['Year', 'Population'], dtype='object')
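A quick sanity check one might run here before plotting; this sketch assumes the file holds one row per decennial census, with Population recorded in millions:
# Inspect the first rows and basic summaries; Population is assumed to be in millions
print(data.head())
print(data.describe())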
In [8]:
# Plot the data
plt.scatter(data['Year'], data['Population'])
plt.xlabel('Year')
plt.ylabel('Population')
plt.show()
In [10]:
# LINEAR MODEL
# Define the linear model
X_lin = sm.add_constant(data['Year']) # Adds a constant term to the predictor
lin_model = sm.OLS(data['Population'], X_lin).fit()
# Summary of the linear model
print(lin_model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Population   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.918
Method:                 Least Squares   F-statistic:                     257.9
Date:                Thu, 01 Aug 2024   Prob (F-statistic):           1.24e-13
Time:                        16:29:56   Log-Likelihood:                -114.70
No. Observations:                  24   AIC:                             233.4
Df Residuals:                      22   BIC:                             235.8
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2600.2970    169.096    -15.378      0.000   -2950.980   -2249.614
Year           1.4245      0.089     16.059      0.000       1.241       1.609
==============================================================================
Omnibus:                        3.921   Durbin-Watson:                   0.096
Prob(Omnibus):                  0.141   Jarque-Bera (JB):                2.503
Skew:                           0.597   Prob(JB):                        0.286
Kurtosis:                       1.962   Cond. No.                     5.25e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.25e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [12]:
# Plot the linear fit
plt.scatter(data['Year'], data['Population'])
plt.plot(data['Year'], lin_model.predict(X_lin), color='red', linewidth=3)
plt.xlabel('Year')
plt.ylabel('Population')
plt.show()
In [14]:
# Clearly, the linear model is too inflexible and restrictive; it does not provide a good fit.
# This is underfitting. Notice, however, that R-squared in this regression is 0.921. Without
# looking at the plot, we could have assumed that the model is very good!
# Prediction for the year 2030 using the linear model
pred_lin_2030 = lin_model.predict([1, 2030])[0]
print(f"Linear model prediction for 2030: {pred_lin_2030}")
Linear model prediction for 2030: 291.5173913043477
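The same value can be reproduced by hand from the estimated coefficients; a minimal sketch:
# Reproduce the prediction manually: y_hat = b0 + b1 * 2030
b0, b1 = lin_model.params   # intercept (const) and slope (Year)
print(b0 + b1 * 2030)       # matches lin_model.predict([1, 2030])[0]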
In [16]:
# This is obviously a poor prediction. The US population was already 331 million at the most
# recent Census. So, we probably omitted an important predictor. Residual plots will help us
# determine which one.
# Let’s produce the related diagnostic plots. plot_regress_exog arranges four of them in a
# 2x2 grid automatically.
# Residual plots
fig = sm.graphics.plot_regress_exog(lin_model, 'Year')
plt.show()
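If only the key diagnostic is needed, the residuals-versus-fitted plot can also be drawn directly from the fitted model; a minimal sketch:
# Residuals vs. fitted values: a clear U-shape signals a missing quadratic term
plt.scatter(lin_model.fittedvalues, lin_model.resid)
plt.axhline(0, color='gray', linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()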
In [18]:
# The first plot shows curvature in the residuals: an important quadratic term in the
# population growth has been omitted. So, fit a quadratic model.
# QUADRATIC MODEL
# Define the quadratic model
data['Year_sq'] = data['Year'] ** 2
X_quadr = sm.add_constant(data[['Year', 'Year_sq']])
quadr_model = sm.OLS(data['Population'], X_quadr).fit()
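As a cross-check (not required for the analysis), numpy can fit the same polynomial; the coefficients should match the statsmodels estimates:
# np.polyfit returns coefficients from the highest degree down: [c2, c1, c0]
coef = np.polyfit(data['Year'], data['Population'], deg=2)
print(coef)   # compare with Year_sq, Year, const in the summary below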
In [20]:
# Summary of the quadratic model
print(quadr_model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Population   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 1.389e+04
Date:                Thu, 01 Aug 2024   Prob (F-statistic):           1.67e-33
Time:                        16:31:20   Log-Likelihood:                -58.969
No. Observations:                  24   AIC:                             123.9
Df Residuals:                      21   BIC:                             127.5
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        2.17e+04    522.731     41.515      0.000    2.06e+04    2.28e+04
Year         -24.1223      0.549    -43.914      0.000     -25.265     -22.980
Year_sq        0.0067      0.000     46.514      0.000       0.006       0.007
==============================================================================
Omnibus:                       10.491   Durbin-Watson:                   1.284
Prob(Omnibus):                  0.005   Jarque-Bera (JB):                8.556
Skew:                          -1.219   Prob(JB):                       0.0139
Kurtosis:                       4.616   Cond. No.                     3.09e+09
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.09e+09. This might indicate that there are
strong multicollinearity or other numerical problems.
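The large condition number arises because Year and Year_sq are almost perfectly correlated over the observed range of years. One standard remedy, sketched here, is to center Year before squaring; this changes the coefficients but not the fitted curve or R-squared:
# Center the predictor to reduce collinearity between Year and Year^2
data['Year_c'] = data['Year'] - data['Year'].mean()
data['Year_c_sq'] = data['Year_c'] ** 2
X_cent = sm.add_constant(data[['Year_c', 'Year_c_sq']])
cent_model = sm.OLS(data['Population'], X_cent).fit()
print(cent_model.summary())   # same fit, far smaller condition number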
In [22]:
# A higher R-squared is not surprising: it always increases when we add
# new variables to the model. The fair criterion for comparing models with
# different numbers of parameters is Adjusted R-squared. The quadratic model
# has Adjusted R-squared = 0.999, compared with 0.918 for the linear model.
# Now let’s obtain the fitted values and plot the fitted curve.
# Plot the quadratic fit
plt.scatter(data['Year'], data['Population'])
plt.plot(data['Year'], quadr_model.predict(X_quadr), color='blue', linewidth=3)
plt.xlabel('Year')
plt.ylabel('Population')
plt.show()
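Adjusted R-squared applies the penalty 1 - (1 - R^2)(n - 1)/(n - p - 1), where n is the number of observations and p the number of predictors. A short sketch verifying the values quoted above:
# Verify adjusted R-squared by hand for both models
for model, p in [(lin_model, 1), (quadr_model, 2)]:
    n = int(model.nobs)
    adj = 1 - (1 - model.rsquared) * (n - 1) / (n - p - 1)
    print(round(adj, 4), round(model.rsquared_adj, 4))   # should agree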
In [24]:
# The quadratic curve fits nearly perfectly. The quadratic growth slowed down only during
# World War II, and the Baby Boom quickly brought the population back to the trend.
# Prediction for the year 2030 using the quadratic model
pred_quadr_2030 = quadr_model.predict([1, 2030, 2030 ** 2])[0]
print(f"Quadratic model prediction for 2030: {pred_quadr_2030}")
# Confidence interval for the mean response in 2030
pred_quadr_2030_conf = quadr_model.get_prediction([1, 2030, 2030 ** 2]).conf_int()
print(f"Confidence interval for the prediction for 2030: {pred_quadr_2030_conf}")
# Prediction interval for a new observation in 2030
pred_quadr_2030_pred = quadr_model.get_prediction([1, 2030, 2030 ** 2]).summary_frame(alpha=0.05)
print(f"Prediction interval for the prediction for 2030: {pred_quadr_2030_pred}")
Quadratic model prediction for 2030: 364.1572134257949
Confidence interval for the prediction for 2030: [[359.96856497 368.34586188]]
Prediction interval for the prediction for 2030:
         mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  obs_ci_upper
0  364.157213  2.014147     359.968565     368.345862    356.610212    371.704215
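The prediction interval is wider than the confidence interval because it adds the variance of a new observation to the variance of the estimated mean. A sketch of that relationship, using the standard errors that get_prediction exposes:
# se_obs^2 should equal se_mean^2 plus the residual variance
pred = quadr_model.get_prediction([1, 2030, 2030 ** 2])
se_mean = pred.se_mean[0]    # uncertainty of the estimated mean response
se_obs = pred.se_obs[0]      # adds the noise of a single new observation
print(se_obs ** 2, se_mean ** 2 + quadr_model.mse_resid)   # should agree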
In [ ]:
# Food for thought... Are the confidence and prediction intervals valid here?
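One way to probe this question: OLS intervals assume independent errors, yet census counts form a time series, and the Durbin-Watson statistics reported above (0.096 linear, 1.284 quadratic) are below 2, hinting at positive autocorrelation. Extrapolating a decade beyond the observed years adds further risk. A sketch of a direct check:
# Positively autocorrelated residuals make OLS intervals too narrow
from statsmodels.stats.stattools import durbin_watson
print(durbin_watson(quadr_model.resid))   # values well below 2 are suspicious
plt.plot(data['Year'], quadr_model.resid, marker='o')
plt.axhline(0, color='gray', linestyle='--')
plt.xlabel('Year')
plt.ylabel('Residual')
plt.show()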